-Looking through the datasets we have available -Need Google Trend data for search terms such as “Cruz” vs “Rourke” or “midterm election” to see if number of searches correlate with # votes for each candidate or # registered voters -What kind of analysis do we want to see -Review the sample final projects
Some links to keep in mind: https://www.cnn.com/election/2018/exit-polls/texas/senate https://www.texastribune.org/2018/10/31/ut-tt-poll-texans-say-immigration-border-security-top-issues/
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(httr)
library(ggridges)
##
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
##
## scale_discrete_manual
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
##
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
##
## plot
library(leaflet)
library(dplyr)
library(purrr)
url = "https://www.nytimes.com/elections/results/texas-senate"
nytimes_data = read_html(url, col_types = "ccdd")
nytimes_data
## {xml_document}
## <html lang="en" itemscope="" xmlns:og="http://opengraphprotocol.org/schema/" itemtype="http://schema.org/NewsArticle">
## [1] <head>\n<title>Texas Senate Election Results: Beto O’Rourke vs. Ted ...
## [2] <body class="eln-race-page eln-2018-11-06 eln-forecast">\n<script ty ...
nytimes_data %>%
html_nodes(css = "table")
## {xml_nodeset (2)}
## [1] <table class="eln-table eln-results-table">\n<thead><tr>\n<th class= ...
## [2] <table class="eln-table eln-county-table">\n<thead><tr>\n<th class=" ...
This seems to have created two tables from the website data.
table_overall = (nytimes_data %>% html_nodes(css = "table")) %>%
.[[1]] %>%
html_table()
#remove the commas in the numbers first
table_overall$Votes <- gsub(",","",table_overall$Votes)
table_overall$Pct. <- gsub("%","",table_overall$Pct.)
#convert votes and percent to numeric
table_overall =
table_overall %>%
mutate(Votes = as.numeric(Votes), Pct. = as.numeric(Pct.))
bar_overall =
table_overall %>%
mutate(Candidate = as.factor(Candidate)) %>%
mutate(Candidate = forcats::fct_reorder(Candidate, Votes)) %>%
ggplot(aes(x = Candidate, y = Votes, fill = Party)) +
geom_bar(stat = "identity") +
labs(
title = "Number of Votes per Candidate",
x = "Candidate",
y = "Number of Votes"
) +
scale_x_discrete(labels = c("Cruz* Ted Cruz*" = "Ted Cruz", "O'Rourke Beto O'Rourke" = "Beto O'Rourke",
"Dikeman Neal Dikeman" = "Neal Dikeman"))
bar_overall

Made the first table that which we have final results for the state of texas.
table_county = (nytimes_data %>% html_nodes(css = "table")) %>%
.[[2]] %>%
html_table() %>%
slice(1:(n() - 1)) %>%
select(-Rpt.) %>%
janitor::clean_names()
#remove the commas in the numbers first
table_county$cruz <- gsub(",","",table_county$cruz)
table_county$o_rourke <- gsub(",","",table_county$o_rourke)
table_county$dikeman <- gsub(",","",table_county$dikeman)
table_county =
table_county %>%
mutate(cruz = as.numeric(cruz), o_rourke = as.numeric(o_rourke), dikeman = as.numeric(dikeman))
table_county$county = str_replace(table_county$county,"La Vaca", "Lavaca")
table_county$county = str_replace(table_county$county,"De Witt", "DeWitt")
#county names misspelled (was causing issues with merging later on)
table_county_long = gather(table_county, key = candidate, value = votes, cruz:dikeman)
Made the second table which has all of the 254 county level data for Texas!
point_county =
table_county_long %>%
filter(candidate != "dikeman") %>%
ggplot(aes(x = county, y = votes, color = candidate)) +
geom_point() +
labs(
title = "Number of Votes for Beto vs Cruz",
x = "County",
y = "Number of Votes"
) +
theme(axis.text.x = element_text(angle = 90), text = element_text(size = 3))
point_county

ggplot(table_county_long, aes(x = candidate, y = votes)) + geom_boxplot()

ggplot(table_county_long, aes(x = votes, y = candidate)) +
geom_density_ridges(scale = .85)
## Picking joint bandwidth of 1080

-using search terms “Midterms” and selecting dataset from top result
district_searches = read_csv(file = "./data/Search_Data_US_Congressional_District_26Sep2018.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## District = col_character(),
## Code = col_character(),
## State = col_character(),
## FIRST = col_character(),
## SECOND = col_character(),
## THIRD = col_character(),
## FOURTH = col_character(),
## FIFTH = col_character(),
## SIXTH = col_character(),
## SEVENTH = col_character(),
## EIGHTH = col_character(),
## NINTH = col_character(),
## TENTH = col_character(),
## `Maternity leave in the United States` = col_integer(),
## `Single-payer healthcare` = col_integer(),
## `Tax Cuts and Jobs Act of 2017` = col_integer(),
## `Transgender people in the military` = col_integer()
## )
## See spec(...) for full column specifications.
TX_searches =
district_searches %>% janitor::clean_names() %>%
filter(state == "TX")
TX_counts =
TX_searches %>%
count(fifth)
#the topics that were searched most were health care, immigration, mental health, united nations
#second most searched: immigration, health care, Medicare, Medicaid, capital punishment
#third most searched: Medicare, Medicaid, September 11 attacks, immigration,
#fourth most searched: Medicaid, Medicare, Immigration...
#fifth: Medicaid, mental health, Medicare, September 11 attacks...
#Thus, we should focus on the variables health care, immigration, Medicare, Medicaid, Mental health, September 11 attacks
congress_district = read.csv(file = "./data/congress_district2.csv")
congress_district$county_name = str_replace(congress_district$county_name,"Sterlin", "Sterling")
congress_district$county_name = str_replace(congress_district$county_name,"MuCulloch","McCulloch")
#misspelled counties discovered while exploring data when merging later on
TX_searches =
TX_searches %>%
separate(code, into = c("remove_1", "district_num"), sep = "-") %>%
mutate(district_num = as.numeric(district_num)) %>%
select(district_num, most_searched = first, x2003_invasion_of_iraq:womens_health)
nested_congress =
congress_district %>%
nest(county_name)
merged_searches= merge(TX_searches, nested_congress, by="district_num", all=TRUE) %>%
unnest()
#rename variable county_name
merged_searches = merged_searches %>%
select(district_num, county=county_name, everything())
merged_nyt_searches = merge(merged_searches, table_county, by= "county", all=TRUE)
Still figuring out how to display this ideas: 1) interactive barchart in the current long format, if we use shiny we could show how the top topics vary among counties through use of drop-down menu to select county, etc. 2) figure out we can juxtapose how the counties voted vs. topics. Use of plotly for interactivity? 3) Alternative to first option, how can we show the distribution of topics among counties instead? 4)focus on 5 biggest counties or districts? but this would be biased as it may be a metropolitan area
merged_nyt_searches_long = gather(merged_nyt_searches, key = topics, value = search_interest, health_care:september_11_attacks)
merged_nyt_searches = read_csv(file ="./data/merged_nyt_searches.csv") #previously merged_nyt_searches dataframe into a csv file
## Parsed with column specification:
## cols(
## .default = col_double(),
## county = col_character(),
## district_num = col_integer(),
## most_searched = col_character(),
## gender_workplace_diversity = col_integer(),
## maternity_leave_in_the_united_states = col_integer(),
## single_payer_healthcare = col_integer(),
## tax_cuts_and_jobs_act_of_2017 = col_integer(),
## transgender_people_in_the_military = col_integer(),
## cruz = col_integer(),
## o_rourke = col_integer(),
## dikeman = col_integer()
## )
## See spec(...) for full column specifications.
gis = merged_nyt_searches %>%
select(county, most_searched, cruz, o_rourke, dikeman) %>%
mutate(cruz_orour_rat = cruz/o_rourke,
status = as.numeric(cruz_orour_rat>=1),
winner = ifelse(status==1, "Ted Cruz", "Beto O'Rourke"))
gis = unique(gis)
gis = gis %>%
nest(most_searched)
listcollapse = function(x){
paste(unlist(x), collapse = ", ")
}
gis = gis %>%
mutate( most_searched = map(data, listcollapse)) %>%
select(-data)
Cleaning data so as to merge with GIS…
texas_counties = counties(state = "48",year = "2017") #Texas fips code is 48, using most recent census data, 2017
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|==================== | 32%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
tx_midterms = geo_join(texas_counties, gis, "NAME", "county")
pal = colorFactor(c("blue", "red"), tx_midterms$status)
popup = paste(sep = "<br/>",
paste0("<b>Winner: </b>", tx_midterms$winner),
paste0("<b>County: </b>", tx_midterms$county),
paste0("<b>Votes for Ted Cruz: </b>", tx_midterms$cruz),
paste0("<b>Votes for Beto O'Rourke: </b>", tx_midterms$o_rourke),
paste0("<b>Votes for Neal Dikeman: </b>", tx_midterms$dikeman),
paste0("<b>Most searched topics over one month prior: </b>", tx_midterms$most_searched))
leaflet(tx_midterms) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolygons(fillColor= ~pal(status),
color = "black",
popup = popup,
fillOpacity = 0.5,
weight = 0.5) %>%
addLegend(position = "bottomleft",
colors = c("blue", "red"),
labels = c("Democrat", "Republican"),
opacity = 1,
title = "2018 Texas Senate Election Results")